URL :¶

_  : https://www.definitions-marketing.com/definition/segmentation-rfm/
_  : https://www.kaggle.com/code/kautumn06/yellowbrick-clustering-evaluation-examples/notebook
_  : https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
_  : https://towardsdatascience.com/an-rfm-customer-segmentation-with-python-cf7be647733d
_  : https://towardsdatascience.com/recency-frequency-monetary-model-with-python-and-how-sephora-uses-it-to-optimize-their-google-d6a0707c5f17
- : https://machinelearningmastery.com/calculate-feature-importance-with-python/

RFM¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option("display.max_column", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", None)
import matplotlib.style
plt.style.use('fast')
In [2]:
data = pd.read_csv('clean_dataset.csv')
In [3]:
# List of categorical features
columns_to_convert_to_categorical = [
    'customer_id',
    'customer_unique_id',
    'customer_city',
    'customer_state',
    'order_id',
    'customer_id',
    'order_status',
    'order_id',
    'product_id',
    'seller_id',
    'order_id',
    'payment_type',
    'review_id',
    'order_id',
    'product_id',
    'product_category_name',
    'seller_id',
    'product_category_name'
]
In [4]:
# List of columns to convert to datetime
columns_to_convert_to_datetime = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
    'shipping_limit_date',
    'review_creation_date',
    'review_answer_timestamp']

# Converting dtype to datetime features
for column_name in columns_to_convert_to_datetime:
    data[column_name] = data[column_name].astype('datetime64[ns]')
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119143 entries, 0 to 119142
Data columns (total 45 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   Unnamed: 0                     119143 non-null  int64         
 1   order_id                       119143 non-null  object        
 2   customer_id                    119143 non-null  object        
 3   order_status                   119143 non-null  object        
 4   order_purchase_timestamp       119143 non-null  datetime64[ns]
 5   order_approved_at              118966 non-null  datetime64[ns]
 6   order_delivered_carrier_date   117057 non-null  datetime64[ns]
 7   order_delivered_customer_date  115722 non-null  datetime64[ns]
 8   order_estimated_delivery_date  119143 non-null  datetime64[ns]
 9   review_id                      118146 non-null  object        
 10  review_score                   118146 non-null  float64       
 11  review_creation_date           118146 non-null  datetime64[ns]
 12  review_answer_timestamp        118146 non-null  datetime64[ns]
 13  payment_sequential             119140 non-null  float64       
 14  payment_type                   119137 non-null  object        
 15  payment_installments           119140 non-null  float64       
 16  payment_value                  119140 non-null  float64       
 17  customer_unique_id             119143 non-null  object        
 18  customer_city                  119143 non-null  object        
 19  customer_state                 119143 non-null  object        
 20  geolocation_zip_code_prefix    118820 non-null  float64       
 21  customer_lat                   118820 non-null  float64       
 22  customer_lng                   118820 non-null  float64       
 23  geolocation_city               118820 non-null  object        
 24  geolocation_state              118820 non-null  object        
 25  order_item_id                  118310 non-null  float64       
 26  product_id                     118310 non-null  object        
 27  seller_id                      118310 non-null  object        
 28  shipping_limit_date            118310 non-null  datetime64[ns]
 29  price                          118310 non-null  float64       
 30  freight_value                  117920 non-null  float64       
 31  product_name_lenght            116601 non-null  float64       
 32  product_description_lenght     116601 non-null  float64       
 33  product_photos_qty             116601 non-null  float64       
 34  product_weight_g               118290 non-null  float64       
 35  product_length_cm              118290 non-null  float64       
 36  product_height_cm              118290 non-null  float64       
 37  product_width_cm               118290 non-null  float64       
 38  product_category_name          116576 non-null  object        
 39  volume                         118290 non-null  float64       
 40  delivery_time                  115707 non-null  float64       
 41  estimated_delivery_time        118966 non-null  float64       
 42  approval_time                  118966 non-null  float64       
 43  purchase_year                  119143 non-null  int64         
 44  purchase_month                 119143 non-null  int64         
dtypes: datetime64[ns](8), float64(21), int64(3), object(13)
memory usage: 40.9+ MB
In [6]:
data = data.drop(columns=['Unnamed: 0'])
In [7]:
df_missing = data.isna().sum().sort_values()/len(data)*100
df_missing
Out[7]:
order_id                         0.000000
purchase_year                    0.000000
customer_state                   0.000000
customer_city                    0.000000
customer_unique_id               0.000000
order_estimated_delivery_date    0.000000
purchase_month                   0.000000
customer_id                      0.000000
order_purchase_timestamp         0.000000
order_status                     0.000000
payment_value                    0.002518
payment_sequential               0.002518
payment_installments             0.002518
payment_type                     0.005036
approval_time                    0.148561
estimated_delivery_time          0.148561
order_approved_at                0.148561
geolocation_city                 0.271103
customer_lat                     0.271103
customer_lng                     0.271103
geolocation_state                0.271103
geolocation_zip_code_prefix      0.271103
shipping_limit_date              0.699160
price                            0.699160
seller_id                        0.699160
product_id                       0.699160
order_item_id                    0.699160
volume                           0.715946
product_width_cm                 0.715946
product_height_cm                0.715946
product_weight_g                 0.715946
product_length_cm                0.715946
review_creation_date             0.836810
review_answer_timestamp          0.836810
review_score                     0.836810
review_id                        0.836810
freight_value                    1.026498
order_delivered_carrier_date     1.750837
product_photos_qty               2.133571
product_name_lenght              2.133571
product_description_lenght       2.133571
product_category_name            2.154554
order_delivered_customer_date    2.871339
delivery_time                    2.883929
dtype: float64
In [8]:
data.head()
Out[8]:
order_id customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date review_id review_score review_creation_date review_answer_timestamp payment_sequential payment_type payment_installments payment_value customer_unique_id customer_city customer_state geolocation_zip_code_prefix customer_lat customer_lng geolocation_city geolocation_state order_item_id product_id seller_id shipping_limit_date price freight_value product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm product_category_name volume delivery_time estimated_delivery_time approval_time purchase_year purchase_month
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 a54f0611adc9ed256b57ede6b6eb5114 4.0 2017-10-11 2017-10-12 03:43:48 1.0 credit_card 1.0 18.12 7c396fd4830fd04220f754e42b4e5bff sao paulo SP 3149.0 -23.574809 -46.587471 sao paulo SP 1.0 87285b34884572647811a353c7ac498a 3504c0cb71d7fa48d967e0e4c94d59d9 2017-10-06 11:07:15 3.400864 2.165619 40.0 268.0 4.0 500.0 19.0 8.0 13.0 housewares 1976.0 8.429144 15.536632 0.007431 2017 10
1 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 a54f0611adc9ed256b57ede6b6eb5114 4.0 2017-10-11 2017-10-12 03:43:48 3.0 voucher 1.0 2.00 7c396fd4830fd04220f754e42b4e5bff sao paulo SP 3149.0 -23.574809 -46.587471 sao paulo SP 1.0 87285b34884572647811a353c7ac498a 3504c0cb71d7fa48d967e0e4c94d59d9 2017-10-06 11:07:15 3.400864 2.165619 40.0 268.0 4.0 500.0 19.0 8.0 13.0 housewares 1976.0 8.429144 15.536632 0.007431 2017 10
2 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 a54f0611adc9ed256b57ede6b6eb5114 4.0 2017-10-11 2017-10-12 03:43:48 2.0 voucher 1.0 18.59 7c396fd4830fd04220f754e42b4e5bff sao paulo SP 3149.0 -23.574809 -46.587471 sao paulo SP 1.0 87285b34884572647811a353c7ac498a 3504c0cb71d7fa48d967e0e4c94d59d9 2017-10-06 11:07:15 3.400864 2.165619 40.0 268.0 4.0 500.0 19.0 8.0 13.0 housewares 1976.0 8.429144 15.536632 0.007431 2017 10
3 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13 8d5266042046a06655c8db133d120ba5 4.0 2018-08-08 2018-08-08 18:37:50 1.0 boleto 1.0 141.46 af07308b275d755c9edb36a90c618231 barreiras BA 47813.0 -12.169860 -44.988369 barreiras BA 1.0 595fac2a385ac33a80bd5114aec74eb8 289cdb325fb7e7f891c38608bf9e0962 2018-07-30 03:24:27 4.776599 3.125005 29.0 178.0 1.0 400.0 19.0 13.0 19.0 perfumery 4693.0 12.502292 17.858021 1.279745 2018 7
4 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04 e73b67b67587f7644d5bd1a52deb1b01 5.0 2018-08-18 2018-08-22 19:07:58 1.0 credit_card 3.0 179.12 3a653a41f6f9fc3d2a113cf8398680e8 vianopolis GO 75265.0 -16.746337 -48.514624 vianopolis GO 1.0 aa4383b373c6aca5d8797843e5594415 4869f7a5dfa277a7dca6462dcf3b52b2 2018-08-13 08:55:23 5.074549 2.955951 46.0 232.0 1.0 420.0 24.0 19.0 21.0 auto 9576.0 9.382708 26.628206 0.011505 2018 8
In [9]:
#data.loc[:, ['customer_unique_id', "order_id"]]
In [10]:
data['order_id']
Out[10]:
0         e481f51cbdc54678b7cc49136f2d6af7
1         e481f51cbdc54678b7cc49136f2d6af7
2         e481f51cbdc54678b7cc49136f2d6af7
3         53cdb2fc8bc7dce0b6741e2150273451
4         47770eb9100c2d0c44946d9cf07ec65d
                        ...               
119138    63943bddc261676b46f01ca7ac2f7bd8
119139    83c1379a015df1e13d02aae0204711ab
119140    11c177c8e97725db2631073c19f07b62
119141    11c177c8e97725db2631073c19f07b62
119142    66dea50a8b16d9b4dee7af250b4be1a5
Name: order_id, Length: 119143, dtype: object
In [11]:
data["customer_unique_id"]
Out[11]:
0         7c396fd4830fd04220f754e42b4e5bff
1         7c396fd4830fd04220f754e42b4e5bff
2         7c396fd4830fd04220f754e42b4e5bff
3         af07308b275d755c9edb36a90c618231
4         3a653a41f6f9fc3d2a113cf8398680e8
                        ...               
119138    da62f9e57a76d978d02ab5362c509660
119139    737520a9aad80b3fbbdad19b66b37b30
119140    5097a5312c8b157bb7be58ae360ef43c
119141    5097a5312c8b157bb7be58ae360ef43c
119142    60350aa974b26ff12caad89e55993bd6
Name: customer_unique_id, Length: 119143, dtype: object
In [12]:
data['payment_value']
Out[12]:
0          18.12
1           2.00
2          18.59
3         141.46
4         179.12
           ...  
119138    195.00
119139    271.01
119140    441.16
119141    441.16
119142     86.86
Name: payment_value, Length: 119143, dtype: float64
In [13]:
data.set_index('customer_unique_id', inplace=True)
In [14]:
now = data['order_purchase_timestamp'].max()
In [15]:
rfm = data.groupby(['customer_unique_id']).agg({
        'order_purchase_timestamp': lambda x: (now - x.max()).days,
        'order_id':'count',
        'payment_value':'sum'})
    
rfm.rename(columns={'order_purchase_timestamp': 'RecencyScore',
                        'order_id': 'FrequencyScore',
                        'payment_value': 'MonetaryScore'}, inplace=True)
In [16]:
rfm['FrequencyScore'] = rfm['FrequencyScore'][rfm['FrequencyScore'] > 0]
rfm['FrequencyScore'] = np.log(rfm['FrequencyScore'])
In [17]:
rfm['MonetaryScore'] = rfm['MonetaryScore'][rfm['MonetaryScore'] > 0]
rfm['MonetaryScore'] = np.log(rfm['MonetaryScore'])
In [18]:
rfm['MonetaryScore']
Out[18]:
customer_unique_id
0000366f3b9a7992bf8c76cfdf3221e2    4.955123
0000b849f77a49e4a4ce2b2a4ca5be3f    3.302849
0000f46a3911fa3c0805444483337064    4.456902
0000f6ccb0745a6a4b88665a16c9f078    3.775516
0004aac84e0df4da2b147fca70cf8255    5.282645
                                      ...   
fffcf5a5ff07b0908bd4e2dbc735a684    8.327204
fffea47cd6d3cc0a88bd621562a9d061    4.437698
ffff371b4d645b6ecea244b27531430a    4.722598
ffff5962728ec6157033ef9805bacc48    4.895524
ffffd2657e2aad2907e67c3e9daecbeb    4.270536
Name: MonetaryScore, Length: 96096, dtype: float64

Récupération de l'identifiant, de la date et de la valeur de chaque commande. Après jointure sur l'identifiant client on peut calculer les variables RFM

In [19]:
fig = plt.figure(1, figsize=(15, 4))

plt.subplot(1, 3, 1)
sns.distplot(rfm.RecencyScore, label='Recency Score',
                 bins=int(1 + np.log2(len(rfm))))
plt.ylabel("%")

plt.subplot(1, 3, 2)
sns.distplot(rfm.FrequencyScore, label='Frequency Score',
                 bins=int(1 + np.log2(len(rfm))))
plt.ylabel("%")

plt.subplot(1, 3, 3)
sns.distplot(rfm.MonetaryScore, label='Monetary Score',
                 bins=int(1 + np.log2(len(rfm))))
plt.ylabel("%")
plt.style.use('ggplot')
plt.tight_layout()
plt.show()
In [20]:
fig = plt.figure(1, figsize=(18, 4))

plt.subplot(131)
sns.boxplot(rfm["RecencyScore"])

plt.subplot(132)
sns.boxplot(rfm["FrequencyScore"])

plt.subplot(133)
sns.boxplot(rfm["MonetaryScore"])
plt.show()
In [21]:
rfm.describe()
Out[21]:
RecencyScore FrequencyScore MonetaryScore
count 96096.000000 96096.000000 96093.000000
mean 287.735691 0.131395 4.805777
std 153.414676 0.339076 0.918514
min 0.000000 0.000000 2.260721
25% 163.000000 0.000000 4.158883
50% 268.000000 0.000000 4.730392
75% 397.000000 0.000000 5.315322
max 772.000000 4.317488 11.601967
In [22]:
rfm.head(5)
Out[22]:
RecencyScore FrequencyScore MonetaryScore
customer_unique_id
0000366f3b9a7992bf8c76cfdf3221e2 160 0.0 4.955123
0000b849f77a49e4a4ce2b2a4ca5be3f 163 0.0 3.302849
0000f46a3911fa3c0805444483337064 585 0.0 4.456902
0000f6ccb0745a6a4b88665a16c9f078 369 0.0 3.775516
0004aac84e0df4da2b147fca70cf8255 336 0.0 5.282645
In [23]:
rfm.info()
<class 'pandas.core.frame.DataFrame'>
Index: 96096 entries, 0000366f3b9a7992bf8c76cfdf3221e2 to ffffd2657e2aad2907e67c3e9daecbeb
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   RecencyScore    96096 non-null  int64  
 1   FrequencyScore  96096 non-null  float64
 2   MonetaryScore   96093 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 4.9+ MB

Features clustering¶

Le datset est aggrégé par clients unique. Les variables catégorielles ne peuvent pas être aggrégés, on va les supprimer

In [24]:
# Create shipping_time variable
data['shipping_time'] = data['shipping_limit_date'] \
    - data['order_delivered_carrier_date']
data['shipping_time'] = data['shipping_time'] \
    .dt.total_seconds() / 86400

# Create review_delay variable 
data['review_delay'] = data['review_creation_date'] \
    - data['order_purchase_timestamp']
data['review_delay'] = data['review_delay'] \
    .dt.total_seconds() / 86400

# Create review_answer_delay variable 
data['review_answer_delay'] = data['review_answer_timestamp'] \
    - data['review_creation_date']
data['review_answer_delay'] = data['review_answer_delay'] \
    .dt.total_seconds() / 86400
In [25]:
# Features to aggregate
features = ['review_creation_date',
            'review_answer_timestamp',
            'estimated_delivery_time',
            'delivery_time',
            'approval_time',
            'purchase_year',
            'purchase_month',
            'order_purchase_timestamp',
            'payment_sequential',
            'freight_value',
            'price',
            'payment_value',
            'order_item_id',
            'product_weight_g',
            'product_photos_qty',
            'product_name_lenght',
            'volume',
            'shipping_limit_date',
            'order_delivered_carrier_date',
            'shipping_time',
            'review_delay',
            'review_answer_delay'
           ]
In [26]:
model = data.groupby('customer_unique_id').agg({'review_creation_date': "mean",
                                                         'review_answer_timestamp': "mean",
                                                         'estimated_delivery_time': "mean",
                                                         'delivery_time': 'mean',
                                                         'approval_time': 'mean',
                                                         'purchase_year': 'mean',
                                                         'purchase_month': 'mean',
                                                         'order_purchase_timestamp': 'mean',
                                                         'payment_sequential': "sum",
                                                         'freight_value': "sum",
                                                         'price': 'sum',
                                                         'payment_value': "sum",
                                                         'order_item_id': 'sum',
                                                         'product_weight_g': "mean",
                                                         'product_photos_qty': 'mean',
                                                         'product_name_lenght': 'mean',
                                                         'volume': "mean",
                                                         'shipping_limit_date': "mean",
                                                         'order_delivered_carrier_date': "mean",
                                                         'shipping_time':"mean",
                                                         'review_delay':"mean",
                                                         'review_answer_delay':"mean",
                                                        })
In [27]:
model.info()
<class 'pandas.core.frame.DataFrame'>
Index: 96096 entries, 0000366f3b9a7992bf8c76cfdf3221e2 to ffffd2657e2aad2907e67c3e9daecbeb
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   review_creation_date          95380 non-null  datetime64[ns]
 1   review_answer_timestamp       95380 non-null  datetime64[ns]
 2   estimated_delivery_time       95997 non-null  float64       
 3   delivery_time                 93343 non-null  float64       
 4   approval_time                 95997 non-null  float64       
 5   purchase_year                 96096 non-null  float64       
 6   purchase_month                96096 non-null  float64       
 7   order_purchase_timestamp      96096 non-null  datetime64[ns]
 8   payment_sequential            96096 non-null  float64       
 9   freight_value                 96096 non-null  float64       
 10  price                         96096 non-null  float64       
 11  payment_value                 96096 non-null  float64       
 12  order_item_id                 96096 non-null  float64       
 13  product_weight_g              95407 non-null  float64       
 14  product_photos_qty            94108 non-null  float64       
 15  product_name_lenght           94108 non-null  float64       
 16  volume                        95407 non-null  float64       
 17  shipping_limit_date           95420 non-null  datetime64[ns]
 18  order_delivered_carrier_date  94466 non-null  datetime64[ns]
 19  shipping_time                 94465 non-null  float64       
 20  review_delay                  95380 non-null  float64       
 21  review_answer_delay           95380 non-null  float64       
dtypes: datetime64[ns](5), float64(17)
memory usage: 16.9+ MB
In [28]:
model.columns = features
In [29]:
#model = model.join(rfm[['RecencyScore','FrequencyScore','MonetaryScore']])
In [30]:
model.head()
Out[30]:
review_creation_date review_answer_timestamp estimated_delivery_time delivery_time approval_time purchase_year purchase_month order_purchase_timestamp payment_sequential freight_value price payment_value order_item_id product_weight_g product_photos_qty product_name_lenght volume shipping_limit_date order_delivered_carrier_date shipping_time review_delay review_answer_delay
customer_unique_id
0000366f3b9a7992bf8c76cfdf3221e2 2018-05-17 2018-05-21 13:44:48 10.533819 6.400914 0.010313 2018.0 5.0 2018-05-10 10:56:27 1.0 2.484907 4.866765 141.90 1.0 1500.0 1.0 60.0 7616.0 2018-05-15 11:11:18 2018-05-12 08:18:00 3.120347 6.544132 4.572778
0000b849f77a49e4a4ce2b2a4ca5be3f 2018-05-11 2018-05-11 22:29:25 7.232130 2.984005 0.301586 2018.0 5.0 2018-05-07 11:11:27 1.0 2.115050 2.939162 27.19 1.0 375.0 1.0 56.0 5148.0 2018-05-11 17:56:33 2018-05-09 12:18:00 2.235104 3.533715 0.937095
0000f46a3911fa3c0805444483337064 2017-04-06 2017-04-07 01:19:44 27.121493 25.731759 0.000000 2017.0 3.0 2017-03-10 21:05:03 1.0 2.846071 4.234107 86.22 1.0 1500.0 3.0 49.0 43750.0 2017-03-15 21:05:03 2017-03-13 12:58:30 2.337882 26.121493 1.055370
0000f6ccb0745a6a4b88665a16c9f078 2017-11-02 2017-11-03 00:17:51 31.132442 20.023472 0.013611 2017.0 10.0 2017-10-12 20:29:41 1.0 2.869602 3.257712 43.62 1.0 150.0 5.0 43.0 1045.0 2017-10-18 21:49:17 2017-10-13 20:08:19 5.070116 20.146053 1.012396
0004aac84e0df4da2b147fca70cf8255 2017-11-28 2017-12-02 22:29:35 20.161898 13.126435 0.014699 2017.0 11.0 2017-11-14 19:45:42 1.0 2.826722 5.192957 196.89 1.0 6050.0 3.0 58.0 528.0 2017-11-22 20:06:52 2017-11-16 19:52:10 6.010208 13.176597 4.937211
In [31]:
model.info()
<class 'pandas.core.frame.DataFrame'>
Index: 96096 entries, 0000366f3b9a7992bf8c76cfdf3221e2 to ffffd2657e2aad2907e67c3e9daecbeb
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   review_creation_date          95380 non-null  datetime64[ns]
 1   review_answer_timestamp       95380 non-null  datetime64[ns]
 2   estimated_delivery_time       95997 non-null  float64       
 3   delivery_time                 93343 non-null  float64       
 4   approval_time                 95997 non-null  float64       
 5   purchase_year                 96096 non-null  float64       
 6   purchase_month                96096 non-null  float64       
 7   order_purchase_timestamp      96096 non-null  datetime64[ns]
 8   payment_sequential            96096 non-null  float64       
 9   freight_value                 96096 non-null  float64       
 10  price                         96096 non-null  float64       
 11  payment_value                 96096 non-null  float64       
 12  order_item_id                 96096 non-null  float64       
 13  product_weight_g              95407 non-null  float64       
 14  product_photos_qty            94108 non-null  float64       
 15  product_name_lenght           94108 non-null  float64       
 16  volume                        95407 non-null  float64       
 17  shipping_limit_date           95420 non-null  datetime64[ns]
 18  order_delivered_carrier_date  94466 non-null  datetime64[ns]
 19  shipping_time                 94465 non-null  float64       
 20  review_delay                  95380 non-null  float64       
 21  review_answer_delay           95380 non-null  float64       
dtypes: datetime64[ns](5), float64(17)
memory usage: 16.9+ MB
In [32]:
# percentage of nan for each column of data
df_missing = model.isna().sum().sort_values()/len(data)*100
df_missing
Out[32]:
price                           0.000000
purchase_year                   0.000000
purchase_month                  0.000000
order_purchase_timestamp        0.000000
payment_sequential              0.000000
freight_value                   0.000000
payment_value                   0.000000
order_item_id                   0.000000
estimated_delivery_time         0.083093
approval_time                   0.083093
shipping_limit_date             0.567385
volume                          0.578297
product_weight_g                0.578297
review_creation_date            0.600959
review_answer_timestamp         0.600959
review_delay                    0.600959
review_answer_delay             0.600959
order_delivered_carrier_date    1.368104
shipping_time                   1.368943
product_photos_qty              1.668583
product_name_lenght             1.668583
delivery_time                   2.310669
dtype: float64
In [33]:
sample = model.join(rfm[['RecencyScore','FrequencyScore','MonetaryScore']])
In [34]:
sample
Out[34]:
review_creation_date review_answer_timestamp estimated_delivery_time delivery_time approval_time purchase_year purchase_month order_purchase_timestamp payment_sequential freight_value price payment_value order_item_id product_weight_g product_photos_qty product_name_lenght volume shipping_limit_date order_delivered_carrier_date shipping_time review_delay review_answer_delay RecencyScore FrequencyScore MonetaryScore
customer_unique_id
0000366f3b9a7992bf8c76cfdf3221e2 2018-05-17 2018-05-21 13:44:48 10.533819 6.400914 0.010313 2018.0 5.0 2018-05-10 10:56:27 1.0 2.484907 4.866765 141.90 1.0 1500.0 1.0 60.0 7616.0 2018-05-15 11:11:18 2018-05-12 08:18:00 3.120347 6.544132 4.572778 160 0.000000 4.955123
0000b849f77a49e4a4ce2b2a4ca5be3f 2018-05-11 2018-05-11 22:29:25 7.232130 2.984005 0.301586 2018.0 5.0 2018-05-07 11:11:27 1.0 2.115050 2.939162 27.19 1.0 375.0 1.0 56.0 5148.0 2018-05-11 17:56:33 2018-05-09 12:18:00 2.235104 3.533715 0.937095 163 0.000000 3.302849
0000f46a3911fa3c0805444483337064 2017-04-06 2017-04-07 01:19:44 27.121493 25.731759 0.000000 2017.0 3.0 2017-03-10 21:05:03 1.0 2.846071 4.234107 86.22 1.0 1500.0 3.0 49.0 43750.0 2017-03-15 21:05:03 2017-03-13 12:58:30 2.337882 26.121493 1.055370 585 0.000000 4.456902
0000f6ccb0745a6a4b88665a16c9f078 2017-11-02 2017-11-03 00:17:51 31.132442 20.023472 0.013611 2017.0 10.0 2017-10-12 20:29:41 1.0 2.869602 3.257712 43.62 1.0 150.0 5.0 43.0 1045.0 2017-10-18 21:49:17 2017-10-13 20:08:19 5.070116 20.146053 1.012396 369 0.000000 3.775516
0004aac84e0df4da2b147fca70cf8255 2017-11-28 2017-12-02 22:29:35 20.161898 13.126435 0.014699 2017.0 11.0 2017-11-14 19:45:42 1.0 2.826722 5.192957 196.89 1.0 6050.0 3.0 58.0 528.0 2017-11-22 20:06:52 2017-11-16 19:52:10 6.010208 13.176597 4.937211 336 0.000000 5.282645
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
fffcf5a5ff07b0908bd4e2dbc735a684 2017-07-07 2017-07-11 11:20:05 54.114398 27.504676 0.010185 2017.0 6.0 2017-06-08 21:00:36 2.0 11.031093 13.313314 4134.84 3.0 30000.0 1.0 54.0 251625.0 2017-06-25 21:15:16 2017-06-23 13:03:27 2.341539 28.124583 4.472280 495 0.693147 8.327204
fffea47cd6d3cc0a88bd621562a9d061 2018-01-10 2018-01-15 11:51:59 32.155324 30.091667 0.005833 2017.0 12.0 2017-12-10 20:07:56 1.0 2.980111 4.172694 84.58 1.0 1720.0 2.0 37.0 32076.0 2017-12-14 20:16:20 2017-12-21 16:27:01 -6.840752 30.161157 5.494433 310 0.000000 4.437698
ffff371b4d645b6ecea244b27531430a 2017-02-23 2017-02-25 15:03:21 44.331435 14.862731 0.009352 2017.0 2.0 2017-02-07 15:49:16 1.0 3.116178 4.498698 112.46 1.0 600.0 5.0 57.0 3315.0 2017-02-11 15:49:16 2017-02-08 10:04:37 3.239340 15.340787 2.627326 617 0.000000 4.722598
ffff5962728ec6157033ef9805bacc48 2018-05-15 2018-05-17 15:16:20 25.334178 11.830313 0.028542 2018.0 5.0 2018-05-02 15:17:41 1.0 2.927989 4.744932 133.69 1.0 444.0 3.0 35.0 4480.0 2018-05-08 15:58:47 2018-05-03 16:45:00 4.967905 12.362720 2.636343 168 0.000000 4.895524
ffffd2657e2aad2907e67c3e9daecbeb 2017-05-11 2017-05-13 10:55:24 22.364433 6.762905 0.789213 2017.0 5.0 2017-05-02 20:18:45 1.0 2.678965 4.042876 71.56 1.0 400.0 2.0 45.0 2178.0 2017-05-10 15:15:13 2017-05-04 12:05:22 6.131840 8.153646 2.455139 532 0.000000 4.270536

96096 rows × 25 columns

Echantillonnage¶

In [35]:
# 5% selection of a sample 
sample = rfm.sample(frac=1)
In [36]:
sample.head(50)
Out[36]:
RecencyScore FrequencyScore MonetaryScore
customer_unique_id
cfe59c8aa4d4bd4344db92d818d04c1f 69 0.000000 3.837946
6b6c763cc4d02614d74b0c372127fe13 517 0.000000 4.546481
e6d08c276c45a22381ec076deeb63b13 257 0.000000 4.625561
0f1190ffa87d71271b42b9df8245258f 626 0.000000 4.969813
448e0dea390e9ed20f330b7e75b12ff3 371 0.000000 4.246207
65c2b54b418a73fcb36a6b6e6765dd23 431 0.693147 5.842384
c06fd806752b504eca5cea51d4bc0c4c 184 1.386294 5.449492
4f94b2a128e75939c61c188212388e3e 503 0.000000 7.252493
10c3a1c4d4262a949a536157ac3c4f6d 67 0.000000 5.722146
4b0f1a28c6fd7af4705d217c728f22de 74 0.000000 5.951814
12728eaaef44ecf87a76083fd1e442c3 345 0.693147 5.248917
f7b223e244004385231a6e4db45a06a9 258 0.000000 4.636087
dfdcc1f8c27500c34a9fc70695b1b1e4 138 0.000000 5.507321
c8ec99d9532ed61578f3717cb0eff87c 229 0.000000 3.988984
4640720dd73950599ff10a468a10d61e 174 0.000000 5.562872
0becec3df826cb50f107c7a99cb44b90 637 0.000000 6.607488
7857512dc9a14ef806537f0443fd5d9e 246 0.000000 3.924544
bd7287fdad74d4d53c8d2b6e1cfd644d 156 0.000000 6.527329
81eb2077986bca5eba40d0796490073a 199 0.000000 4.334542
f84a99a71347da8c4956f56a987211ba 519 0.000000 4.863913
642ac36382354363985c6bfdba639c42 249 0.000000 3.625673
87307bedec2b4496fc1fac01d1d98a20 219 0.000000 4.670302
2f76c8ac8310a38b82baafddbf090db8 77 0.000000 4.689787
2542d967894acdc11dbffa8dd2e82a6a 154 1.098612 6.505470
2e6922b1b345f1606b329d24f3d09136 381 0.693147 3.397189
6861be7c47819049f96ce098fa072e3a 212 0.000000 4.374246
628bf7b073cfff27a6d6ad849c29a1f2 245 0.000000 6.315087
f6fccceee418fcc63939b6bb5e20128d 155 0.000000 3.691875
4ddabde26cd4b8378ccbdc76daf8b745 600 0.000000 4.396176
c861accb34a721194cc29e7a7e5fe5f0 309 0.000000 4.331785
787a5c4e16118ce0267bb67520cf8179 366 0.000000 4.175925
afa602f01babaaaa3376444634400a25 272 0.000000 4.535820
f79aa45136d8243492e2eaa67f203425 171 0.000000 4.746236
56f154706913672dc7f0c7350db0dda2 165 0.000000 4.720729
5a465fa8d582b46073747a7a0dafb516 241 0.000000 3.613617
d9a99b290182aff8db93595d904ab99c 625 0.693147 6.070276
b2387b8bb8016bffddb4ff61265a4fd4 406 0.000000 4.666171
a740a724bc243698bdbf93a8dd68a09a 305 0.000000 4.945136
edd176efa31cbfc929efbac163cb45a6 368 0.693147 6.724025
5cee614881d2b4cccc3378dcc933578d 349 0.000000 3.593194
fbb8b4ad21213ec7a23c867b6bcdf843 599 0.000000 3.822973
981665ad276011294ada8eddaaa56d8f 333 0.000000 5.506469
9b3b3eaa63df7cc2909934af6d2034ba 245 0.000000 4.667018
5a4b4cc7bcccb91c0eba9112d7c4c8bd 253 0.000000 4.879083
947c620fd333114daef626378bb14848 326 0.000000 4.780803
42d8b87cd525a34a306cca6db06d83a1 386 0.000000 5.313895
c843b255e98911eb5a8c0062e6b0894b 293 0.000000 5.335854
d07abcc5931dc246719c4cb0c7fd53f0 185 0.000000 3.193763
c326cd6e78ae9b11f8fee1bb7ac9b441 126 0.000000 4.025887
076ad5b91ba9b470ed7126b6f22fe201 171 0.000000 4.367040
In [37]:
sample.info()
<class 'pandas.core.frame.DataFrame'>
Index: 96096 entries, cfe59c8aa4d4bd4344db92d818d04c1f to 4cc93c73069ea5ba716248771a30fffb
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   RecencyScore    96096 non-null  int64  
 1   FrequencyScore  96096 non-null  float64
 2   MonetaryScore   96093 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 2.9+ MB

Imputers, transformers and pipeline¶

In [38]:
# Imputers
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer


# Transformers
from sklearn.decomposition import PCA
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
In [39]:
# Separation into numerical and categorical features
numerical_features = sample.select_dtypes(include=['int64', 'float64']).columns
categorical_features = sample.select_dtypes(include=['object', 'bool']).columns

pca = PCA(n_components=2) X_pca = pca.fit_transform(sample) total_explained_variance = pca.explained_varianceratio.cumsum()

In [40]:
# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
    #('PCA', PCA(n_components=3)),
    #('std_scaler', RobustScaler()),
    #('std_scaler', MinMaxScaler()),
    #('QuantileTransformer', QuantileTransformer()),
])
In [41]:
preprocessing_pipeline = Pipeline(steps=[('numerical_transformer', numerical_transformer)])
In [42]:
index = sample.index
print(index)
Index(['cfe59c8aa4d4bd4344db92d818d04c1f', '6b6c763cc4d02614d74b0c372127fe13',
       'e6d08c276c45a22381ec076deeb63b13', '0f1190ffa87d71271b42b9df8245258f',
       '448e0dea390e9ed20f330b7e75b12ff3', '65c2b54b418a73fcb36a6b6e6765dd23',
       'c06fd806752b504eca5cea51d4bc0c4c', '4f94b2a128e75939c61c188212388e3e',
       '10c3a1c4d4262a949a536157ac3c4f6d', '4b0f1a28c6fd7af4705d217c728f22de',
       ...
       '0e8750e4ffa3dc86470cdfa30237f867', 'a9634ba777001268fd432bfdf0f21a10',
       'b4a365450bf1efc7d7935fb4d838c3e7', '5270cffd29ac266ad0acc6bf4261321c',
       '049978bae2febc352587dab45fb38d3a', '86466e264fde8cca4fd07bec8d83fe96',
       'bb5b5f40144184d10b184574c513fcd3', '89baf783e8bf8766ee306ad8aa15ef2a',
       '14f5335a43420b92aa77e1f35ada3f42', '4cc93c73069ea5ba716248771a30fffb'],
      dtype='object', name='customer_unique_id', length=96096)
In [43]:
columns = sample.columns
print(columns)
Index(['RecencyScore', 'FrequencyScore', 'MonetaryScore'], dtype='object')
In [44]:
sample.iloc[:,0]
Out[44]:
customer_unique_id
cfe59c8aa4d4bd4344db92d818d04c1f     69
6b6c763cc4d02614d74b0c372127fe13    517
e6d08c276c45a22381ec076deeb63b13    257
0f1190ffa87d71271b42b9df8245258f    626
448e0dea390e9ed20f330b7e75b12ff3    371
                                   ... 
86466e264fde8cca4fd07bec8d83fe96    390
bb5b5f40144184d10b184574c513fcd3     60
89baf783e8bf8766ee306ad8aa15ef2a    215
14f5335a43420b92aa77e1f35ada3f42    164
4cc93c73069ea5ba716248771a30fffb     70
Name: RecencyScore, Length: 96096, dtype: int64
In [45]:
sample.iloc[:,1]
Out[45]:
customer_unique_id
cfe59c8aa4d4bd4344db92d818d04c1f    0.0
6b6c763cc4d02614d74b0c372127fe13    0.0
e6d08c276c45a22381ec076deeb63b13    0.0
0f1190ffa87d71271b42b9df8245258f    0.0
448e0dea390e9ed20f330b7e75b12ff3    0.0
                                   ... 
86466e264fde8cca4fd07bec8d83fe96    0.0
bb5b5f40144184d10b184574c513fcd3    0.0
89baf783e8bf8766ee306ad8aa15ef2a    0.0
14f5335a43420b92aa77e1f35ada3f42    0.0
4cc93c73069ea5ba716248771a30fffb    0.0
Name: FrequencyScore, Length: 96096, dtype: float64
In [46]:
sample.iloc[:,2]
Out[46]:
customer_unique_id
cfe59c8aa4d4bd4344db92d818d04c1f    3.837946
6b6c763cc4d02614d74b0c372127fe13    4.546481
e6d08c276c45a22381ec076deeb63b13    4.625561
0f1190ffa87d71271b42b9df8245258f    4.969813
448e0dea390e9ed20f330b7e75b12ff3    4.246207
                                      ...   
86466e264fde8cca4fd07bec8d83fe96    3.654029
bb5b5f40144184d10b184574c513fcd3    5.506632
89baf783e8bf8766ee306ad8aa15ef2a    3.696103
14f5335a43420b92aa77e1f35ada3f42    7.702204
4cc93c73069ea5ba716248771a30fffb    4.101320
Name: MonetaryScore, Length: 96096, dtype: float64
In [47]:
sample_scaled = preprocessing_pipeline.fit_transform(sample)
#sample_scaled = pd.DataFrame(sample_scaled, index = sample.index)
#sample_scaled = pd.DataFrame(sample_scaled,  index = sample.index, columns = sample.columns)
In [48]:
fig=plt.figure(figsize=(5,5))
plt.scatter(sample_scaled[:,0], sample_scaled[:,1])
Out[48]:
<matplotlib.collections.PathCollection at 0x11d04532400>
In [49]:
sample_scaled.shape
Out[49]:
(96096, 3)
In [50]:
#print(sample_scaled)
In [51]:
sample_scaled[:,0].shape
Out[51]:
(96096,)
In [52]:
sample_scaled[:,1].shape
Out[52]:
(96096,)

Métriques de Kmeans¶

In [53]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import davies_bouldin_score

sse, slc, db = {}, {}, {}
for i in range(2, 7):
    kmeans = KMeans(init='k-means++', n_clusters=i, random_state=10, max_iter=400, n_init=50).fit(sample_scaled)
    clusters = kmeans.labels_
    sse[i] = kmeans.inertia_
    slc[i] = silhouette_score(sample_scaled,clusters)
    db[i] = davies_bouldin_score(sample_scaled,clusters)
In [54]:
# Graphs for each metric

fig, (ax0, ax1, ax2) = plt.subplots(1,3, figsize = (25, 5))

ax0.plot(list(sse.keys()), list(sse.values()), color="blue", marker='o')
ax0.set_xlabel("number of cluster")
ax0.set_ylabel("inertia")

ax1.plot(list(slc.keys()), list(slc.values()), color="gold", marker='o')
ax1.set_xlabel("number of cluster")
ax1.set_ylabel("silhouette score")

ax2.plot(list(db.keys()), list(db.values()), color="red", marker='o')
ax2.set_xlabel("Number of cluster")
ax2.set_ylabel("Davies-Bouldin score")
Out[54]:
Text(0, 0.5, 'Davies-Bouldin score')

Kmeans et graphs TSNE¶

In [55]:
from sklearn.manifold import TSNE
import matplotlib.cm as cm
n_clusters=4

sample_scaled = preprocessing_pipeline.fit_transform(sample)
fig, ax2 = plt.subplots(1)
fig.set_size_inches(10, 10)


clusterer = KMeans(n_clusters=n_clusters, random_state=10, max_iter=400, n_init=50)
cluster_labels = clusterer.fit_predict(sample_scaled)
labels_scale = clusterer.labels_

# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed clusters
    
silhouette_avg = silhouette_score(sample_scaled, cluster_labels)
print("for n clusters =", n_clusters, "average silhouette_score is :", silhouette_avg)

# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(sample_scaled, cluster_labels)

# 2nd Plot showing the actual clusters formed


reducer1=TSNE(n_components=2)
data_reduced1 = reducer1.fit_transform(sample_scaled)


sns.scatterplot(data_reduced1[:,0], data_reduced1[:,1], palette='deep', hue=cluster_labels, legend="full",alpha=0.7,ax=ax2)


ax2.set_title("clustered data")
ax2.set_xlabel("feature space for 1st feature")
ax2.set_ylabel("feature space for 2nd feature")
plt.suptitle(("TSNE for KMeans clustering"
                  "with n clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()
for n clusters = 4 average silhouette_score is : 0.34016719280744995
In [56]:
from sklearn.cluster import KMeans
# Définitiondu dataframe preprocessed
sample_scaled = preprocessing_pipeline.fit_transform(sample)
df_sample_scaled = pd.DataFrame(sample_scaled,
                                 index=sample.index,
                                 columns=sample.columns)


n_clusters=4

clusterer = KMeans(n_clusters=n_clusters, random_state=10, max_iter=400, n_init=50)
cluster_labels = clusterer.fit_predict(sample_scaled)
labels_scale = clusterer.labels_

# Dataframe of identified clusters
df_cluster_labels = pd.DataFrame(labels_scale, index = df_sample_scaled.index)
df_cluster_labels.columns = ['cluster_labels']
In [57]:
# Distribution of clients in clusters
sample_clustered = df_sample_scaled.join(df_cluster_labels)
sample_clustered['cluster_labels'].value_counts(normalize=True).sort_index().plot.bar(figsize=(9, 4))
Out[57]:
<AxesSubplot:>
In [58]:
# Distribution of clients in clusters
sample_clustered['cluster_labels'].value_counts().sort_index()/len(sample_clustered)*100
Out[58]:
0    33.844281
1    14.060939
2    25.979229
3    26.115551
Name: cluster_labels, dtype: float64
In [59]:
df_test = sample_clustered.groupby(['cluster_labels']).agg({
        'RecencyScore': 'size',
        'FrequencyScore':'size',
        'MonetaryScore':'size'})
In [60]:
df_test.sort_index().plot.bar(figsize=(9, 4))
Out[60]:
<AxesSubplot:xlabel='cluster_labels'>

DBSCAN¶

In [66]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.manifold import TSNE
import matplotlib.cm as cm


sample_scaled = preprocessing_pipeline.fit_transform(sample)
fig, ax2 = plt.subplots(1)
fig.set_size_inches(10, 10)


# Compute DBSCAN
db = DBSCAN(eps=0.05, min_samples=10).fit(sample_scaled)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_


# 2nd Plot showing the actual clusters formed


reducer1=TSNE(n_components=2)
data_reduced1 = reducer1.fit_transform(sample_scaled)

sns.scatterplot(data_reduced1[:,0], data_reduced1[:,1], palette='deep', hue=labels, legend="full",alpha=0.7,ax=ax2)


ax2.set_title("clustered data")
ax2.set_xlabel("feature space for 1st feature")
ax2.set_ylabel("feature space for 2nd feature")
plt.suptitle("TSNE for DBSCAN", fontsize=14, fontweight='bold')

plt.show()

variables latentes des données et partitionnement avec un classifier¶

In [67]:
# Définitiondu dataframe preprocessed
sample_imputed = preprocessing_pipeline.fit_transform(sample)
df_sample_imputed = pd.DataFrame(sample_imputed,
                                 index=sample.index,
                                 columns=sample.columns)
In [68]:
sample_scaled = preprocessing_pipeline.fit_transform(sample)
sample_scaled = pd.DataFrame(sample_scaled,  index = sample.index, columns = sample.columns)
n_clusters=4

clusterer = KMeans(n_clusters=n_clusters, random_state=10, max_iter=400, n_init=50)
cluster_labels = clusterer.fit_predict(sample_scaled)
labels_scale = clusterer.labels_

# Dataframe of identified clusters
df_cluster_labels = pd.DataFrame(cluster_labels, index = sample_scaled.index)
df_cluster_labels.columns = ['cluster_labels']

clustering hiérarchique¶

In [69]:
# hierachical clustering for flexibiliy and small ammount of features (<50)
# random forest for feature importance on a classification problem 
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot

X, y = df_sample_imputed, df_cluster_labels
# define the model
classeur = RandomForestClassifier()
# fit the model
classeur.fit(X, y)
# get importance
importance = classeur.feature_importances_
df_importance = pd.DataFrame(importance,
                             index=X.columns,
                             )
df_importance = df_importance.rename(columns={0: 'importance'})
df_importance = df_importance.sort_values(by = ['importance'], ascending=False)
# plot feature importance
 # .nlargest(4) # pandas Series method which will return a subset of the series with the largest n values.

sns.barplot(x='importance',
            y=df_importance.index, data=df_importance)
C:\Users\tropical noise\AppData\Local\Temp\ipykernel_11864\1449054764.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  classeur.fit(X, y)
Out[69]:
<AxesSubplot:xlabel='importance'>
In [70]:
# Distribution of clients in clusters
sample_clustered['cluster_labels'].value_counts().sort_index()/len(sample_clustered)*100
Out[70]:
0    33.844281
1    14.060939
2    25.979229
3    26.115551
Name: cluster_labels, dtype: float64
In [71]:
# Parallel Coordinates Plots pour la caractérisation des clusters

import plotly.graph_objects as go

fig = go.Figure(data=
    go.Parcoords(
        line = dict(color = sample_clustered['cluster_labels'],
                   colorscale = [[0,'red'],[0.1,'blue'],[0.2,'yellow'], [0.3,'green'],[0.4,'darkgreen'],
                                 [0.5,'orange'], [0.6,'purple'],[0.7,'lightblue'],[1,'gold']]),
        
        dimensions = list([
            dict(range = [sample_clustered['RecencyScore'].min(),sample_clustered['RecencyScore'].max()],
                 #constraintrange = [100000,150000],
                 label = "RecencyScore", values = sample_clustered['RecencyScore']),
            
            dict(range = [-1,6],
                 tickvals = [0,1,2,3,4,5],
                 label = 'cluster_labels', values = sample_clustered['cluster_labels']),
            
            dict(range = [sample_clustered['FrequencyScore'].min(),sample_clustered['FrequencyScore'].max()],
                 #constraintrange = [100000,150000],
                 label = "FrequencyScore", values = sample_clustered['FrequencyScore']),
            
            dict(range = [sample_clustered['MonetaryScore'].min(),sample_clustered['MonetaryScore'].max()],
                 #constraintrange = [100000,150000],
                 label = "MonetaryScore", values = sample_clustered['MonetaryScore'])])  
    )
)

fig.update_layout(
    plot_bgcolor = 'white',
    paper_bgcolor = 'white'
)

fig.show()
In [72]:
# Valeurs moyennes par variables et par clusters
column_names = list(sample_clustered.columns)

df_mean = pd.DataFrame(columns = column_names)

for i in range(4):
    df_mean = df_mean.append((pd.DataFrame(sample_clustered[sample_clustered['cluster_labels']==i].mean())).transpose())
In [73]:
df_mean.head()
Out[73]:
RecencyScore FrequencyScore MonetaryScore cluster_labels
0 -0.58871 -0.361765 -0.781065 0.0
0 -0.045746 2.219526 1.12246 1.0
0 -0.46266 -0.38751 0.690989 2.0
0 1.24781 -0.340706 -0.279512 3.0
In [74]:
df_mean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 0
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   RecencyScore    4 non-null      object
 1   FrequencyScore  4 non-null      object
 2   MonetaryScore   4 non-null      object
 3   cluster_labels  4 non-null      object
dtypes: object(4)
memory usage: 160.0+ bytes
In [75]:
df_mean['MonetaryScore'].min()
Out[75]:
-0.7810652642820877
In [76]:
# Parallel Coordinates Plots pour la caractérisation des clusters

import plotly.graph_objects as go

fig = go.Figure(data=
    go.Parcoords(
        line = dict(color = df_mean['cluster_labels'],
                   colorscale = [[0,'red'],[0.1,'blue'],[0.2,'yellow'], [0.3,'green'],[0.4,'darkgreen'],
                                 [0.5,'orange'], [0.6,'purple'],[0.7,'lightblue'],[1,'gold']]),
        
        dimensions = list([
            
            dict(range = [0,4],
                 tickvals = [0,1,2,3],
                 label = 'cluster_labels', values = df_mean['cluster_labels']),
            
            dict(range = [sample_clustered['MonetaryScore'].min(),sample_clustered['MonetaryScore'].max()],
                 label = "MonetaryScore", values = sample_clustered['MonetaryScore'].astype('int64')),
            
            dict(range = [df_mean['FrequencyScore'].min(),df_mean['FrequencyScore'].max()],
                 label = "FrequencyScore", values = df_mean['FrequencyScore'].astype('int64')),
            
            dict(range = [sample_clustered['RecencyScore'].min(),sample_clustered['RecencyScore'].max()],
                 label = "RecencyScore", values = sample_clustered['RecencyScore'].astype('int64'))])

    )
)

fig.update_layout(
    plot_bgcolor = 'white',
    paper_bgcolor = 'white'
)

fig.show()

Stabilité du Kmeans dans le temps¶

In [77]:
rfm.head()
Out[77]:
RecencyScore FrequencyScore MonetaryScore
customer_unique_id
0000366f3b9a7992bf8c76cfdf3221e2 160 0.0 4.955123
0000b849f77a49e4a4ce2b2a4ca5be3f 163 0.0 3.302849
0000f46a3911fa3c0805444483337064 585 0.0 4.456902
0000f6ccb0745a6a4b88665a16c9f078 369 0.0 3.775516
0004aac84e0df4da2b147fca70cf8255 336 0.0 5.282645
In [78]:
rfm.info()
<class 'pandas.core.frame.DataFrame'>
Index: 96096 entries, 0000366f3b9a7992bf8c76cfdf3221e2 to ffffd2657e2aad2907e67c3e9daecbeb
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   RecencyScore    96096 non-null  int64  
 1   FrequencyScore  96096 non-null  float64
 2   MonetaryScore   96093 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 4.9+ MB
In [79]:
sample_scaled = preprocessing_pipeline.fit_transform(rfm)

# transformation into dataframe
df_customer_dataset_reduced = pd.DataFrame(sample_scaled, 
                                    index = rfm.index, 
                                    columns = rfm.columns)

df_customer_dataset_reduced_time = pd.merge(model[['purchase_year', 'purchase_month']], 
                                            df_customer_dataset_reduced, 
                                            left_index=True, right_index=True,  how = 'outer')
In [80]:
n_clusters=4
clusterer = KMeans(n_clusters=n_clusters, random_state=10, max_iter=400, n_init=50)
cluster_labels = clusterer.fit_predict(sample_scaled)
labels_scale = clusterer.labels_

cluster_labels = clusterer.fit_predict(sample_scaled)
df_cluster_labels = pd.DataFrame(cluster_labels, index = df_customer_dataset_reduced.index)
df_cluster_labels.rename(columns={0: "clusters"}, inplace = True)
In [81]:
df_customer_dataset_reduced_time.info()
<class 'pandas.core.frame.DataFrame'>
Index: 96096 entries, 0000366f3b9a7992bf8c76cfdf3221e2 to ffffd2657e2aad2907e67c3e9daecbeb
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   purchase_year   96096 non-null  float64
 1   purchase_month  96096 non-null  float64
 2   RecencyScore    96096 non-null  float64
 3   FrequencyScore  96096 non-null  float64
 4   MonetaryScore   96096 non-null  float64
dtypes: float64(5)
memory usage: 6.4+ MB
In [82]:
from sklearn.metrics.cluster import adjusted_rand_score
# list of purchase_year and purchase_month
year =  (df_customer_dataset_reduced_time.purchase_year.astype(int).unique()).tolist() 
month =  (df_customer_dataset_reduced_time.purchase_month.astype(int).unique()).tolist()

datasets = []
ari = []

# for each month of each year we extract the corresponding part of the dataset and we store it in a list of monthly datasets
for y in year:
    for m in month:
        data = df_customer_dataset_reduced_time.loc[(df_customer_dataset_reduced_time.purchase_year == y)
                                                    & (df_customer_dataset_reduced_time.purchase_month == m),:]
        datasets.append(data)
        
df_merge = []

# for each element of the monthly dataset list, calculate the ARI score between the labels of a monthly dataset 
# and those of the global dataset for the same period

for k in range(len(datasets)):
    df_merge.append(datasets[k])
    if len(df_merge) == 1 :
        df_final = df_merge[0]
        if len(df_final) >= 4:
            df_final.drop(['purchase_year', 'purchase_month'], axis = 1 , inplace = True)
            clusterer.fit(df_final)
            labels_true = df_cluster_labels.loc[df_final.index, 'clusters'].tolist()
            ari.append(adjusted_rand_score(labels_true, clusterer.labels_))
    else:
        df_final = pd.concat(df_merge)
        if len(df_final) >= 4:
            df_final.drop(['purchase_year', 'purchase_month'], axis = 1 , inplace = True)
            clusterer.fit(df_final)
            labels_true = df_cluster_labels.loc[df_final.index, 'clusters'].tolist()
            ari.append(adjusted_rand_score(labels_true, clusterer.labels_))

ari.reverse()

#plot of results
fig = plt.figure(figsize=(16, 4))
plt.plot(range(len(ari)), ari, marker='o')
plt.xlabel('nombre de mois')
plt.ylabel('indice ARI')
C:\Users\tropical noise\AppData\Local\Temp\ipykernel_11864\96580728.py:26: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[82]:
Text(0, 0.5, 'indice ARI')